# --- Environment setup: plotting backend, notebook display mode, DB access ---
import pandas as pd
from scipy import stats
from IPython.core.interactiveshell import InteractiveShell
import hvplot.pandas
# Route pandas .plot calls through holoviews/hvplot.
pd.options.plotting.backend = "holoviews"
# Display every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import panel as pn
import scipy
from panel.template import DefaultTheme
pn.extension()
from plotnine import ggplot, aes, geom_histogram
# Local Postgres credentials (empty password for localhost).
# NOTE(review): hard-coded credentials — prefer environment variables.
cred = {"host": 'localhost', 'dbname': 'yukontaf', 'user': 'glebsokolov', 'password': ''}
from sqlalchemy import create_engine
# SQLAlchemy engine shared by the select() helper below.
con = create_engine(
f'postgresql://{cred["user"]}:{cred["password"]}@{cred["host"]}/{cred["dbname"]}'
)
def select(sql):
    """Execute *sql* against the module-level engine ``con``.

    Returns the full result set as a pandas DataFrame.
    """
    result = pd.read_sql(sql, con)
    return result
# Pull the whole appointments table into memory (~110k rows per the
# df.info() output captured below).
sql = '''select * from noshowappointment'''
df = select(sql)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 110527 entries, 0 to 110526 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 110527 non-null int64 1 Unnamed: 0 110527 non-null int64 2 PatientId 110527 non-null float64 3 Gender 110527 non-null float64 4 ScheduledDay 110527 non-null object 5 AppointmentDay 110527 non-null object 6 Age 110527 non-null int64 7 Neighbourhood 110527 non-null float64 8 Scholarship 110527 non-null int64 9 Hypertension 110527 non-null int64 10 Diabetes 110527 non-null int64 11 Alcoholism 110527 non-null int64 12 Handcap 110527 non-null int64 13 SMS_received 110527 non-null int64 14 No-show 110527 non-null float64 dtypes: float64(4), int64(9), object(2) memory usage: 12.6+ MB
df.head().hvplot.table(width=1480, height=240)
First, let's preprocess the data: make the target variable boolean, ordinal-encode the Neighbourhood and Gender columns, cast the columns containing dates to datetime type, drop AppointmentID (as we won't need it), and calculate a new feature — awaiting time. Then let's also check the data for outliers and impossible records (e.g. negative age and so on).
# --- Preprocessing: drop junk column, parse dates, encode categoricals ---
from sklearn.preprocessing import OrdinalEncoder
# Leftover auto-generated index column from the original CSV import.
del df["Unnamed: 0"]
# Cast both date columns to pandas datetimes.
df[["ScheduledDay", "AppointmentDay"]] = df[["ScheduledDay", "AppointmentDay"]].apply(
lambda x: pd.to_datetime(x)
)
# Ordinal-encode the categoricals in place; the binary columns (Gender,
# No-show) become 0.0/1.0 codes.
enc = OrdinalEncoder()
df[["Gender", "Neighbourhood", "No-show"]] = enc.fit_transform(
df[["Gender", "Neighbourhood", "No-show"]]
)
# Days waited between scheduling and the appointment. ScheduledDay carries a
# time of day while AppointmentDay is midnight, so the floored timedelta is
# negated rather than reversing the subtraction; negative results flag rows
# scheduled after the appointment and are removed below.
# NOTE(review): Timedelta.days floors, so partial days round toward a longer
# wait — confirm this is the intended definition.
df["AwaitingTime"] = -(df["ScheduledDay"] - df["AppointmentDay"]).dt.days
text2 = """After preprocessing our"""
# Enlarged preview table of the cleaned frame.
tab2 = df.head().hvplot.table(width=1640, height=840)
# Inspect the anomalous rows (negative age, or scheduled after the
# appointment) before removing them; displayed by the notebook.
bad_rows = df.query("Age==-1 or AwaitingTime<0")
bad_rows
# Smallest ages, to confirm -1 is the only impossible value.
df.sort_values(by="Age", ascending=True)["Age"].unique()[:10]
# Drop the impossible records.
df = df.drop(index=bad_rows.index)
| index | PatientId | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handcap | SMS_received | No-show | AwaitingTime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 27033 | 27033 | 7.839273e+12 | 1.0 | 2016-05-10 10:51:53+00:00 | 2016-05-09 00:00:00+00:00 | 38 | 59.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1.0 | -1 |
| 55226 | 55226 | 7.896294e+12 | 0.0 | 2016-05-18 14:50:41+00:00 | 2016-05-17 00:00:00+00:00 | 19 | 69.0 | 0 | 0 | 0 | 0 | 1 | 0 | 1.0 | -1 |
| 64175 | 64175 | 2.425226e+13 | 0.0 | 2016-05-05 13:43:58+00:00 | 2016-05-04 00:00:00+00:00 | 22 | 13.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | -1 |
| 71533 | 71533 | 9.982316e+14 | 0.0 | 2016-05-11 13:49:20+00:00 | 2016-05-05 00:00:00+00:00 | 81 | 69.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | -6 |
| 72362 | 72362 | 3.787482e+12 | 1.0 | 2016-05-04 06:50:57+00:00 | 2016-05-03 00:00:00+00:00 | 7 | 78.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | -1 |
| 99832 | 99832 | 4.659432e+14 | 0.0 | 2016-06-06 08:58:13+00:00 | 2016-06-06 00:00:00+00:00 | -1 | 60.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 |
array([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8])
As we can see here, we have several records with a negative awaiting time; we delete them. We also see that we have one record with a negative age.
From now on, 1 will encode the group who didn't come, and 0 the group who came.
Now let's look at the distributions of the categorical variables. To do so, we'll build bar plots and a histogram for age; one more bar plot will cover all the neighbourhoods.
# Selector for the target class; per the text above, 1.0 encodes "did not
# come" and 0.0 "came".
show_opts = [
1.0,
0.0,
]
show_opt = pn.widgets.Select(name="No-show", options=show_opts)
# hvplot interactive pipeline filtered by the selector's current value.
idf = df.interactive()
data_pipeline = idf[idf["No-show"] == show_opt]
text01 = '''First, let's have a look at the distributions of ages and neighbourhoods'''
# Age distribution histogram.
viz1 = df["Age"].hvplot(kind="hist", title='Histogram for Ages')
viz1
# Appointment counts per (neighbourhood, no-show) pair: stacked bars + table.
group = df.groupby(["Neighbourhood", "No-show"]).count()["PatientId"]
viz2 = group.plot.bar(
stacked=True, title="Neighbourhood", rot=90, width=1024
) + group.hvplot.table(width=360)
viz2
Now, let's write a function that calculates a confidence interval for the difference of two proportions (between the show and no-show groups). Then we'll write a function that draws a conclusion from that confidence interval.
def proportions_confint_diff(sample1, sample2, alpha=0.05):
    """Normal-approximation (Wald) confidence interval for the difference of
    two proportions, p1 - p2, expressed in percentage points.

    Parameters
    ----------
    sample1, sample2 : sequences of 0/1 values (successes coded as 1);
        pandas Series of 0.0/1.0 codes work as well.
    alpha : significance level; the interval has confidence 1 - alpha.

    Returns
    -------
    (left, right) : tuple of bounds in percent, each rounded to 3 decimals.

    Raises
    ------
    ZeroDivisionError : if either sample is empty.
    """
    # Two-sided critical value; `stats` is the alias imported at file top.
    z = stats.norm.ppf(1 - alpha / 2.0)
    p1 = sum(sample1) / len(sample1)
    p2 = sum(sample2) / len(sample2)
    # Standard error of the difference — computed once instead of
    # duplicating the sqrt expression for each bound (original defect).
    se = np.sqrt(p1 * (1 - p1) / len(sample1) + p2 * (1 - p2) / len(sample2))
    diff = p1 - p2
    return (round((diff - z * se) * 100, 3), round((diff + z * se) * 100, 3))
def calculate_ci(group, alpha):
    """Compare no-show proportions between the two levels of binary column *group*.

    Builds a (1 - alpha) confidence interval for
    p(No-show | group==1) - p(No-show | group==0) and returns a plain-language
    verdict string.

    BUG FIX: both decisive branches previously labelled the first sample as
    "({group}=0)", but the first sample passed to proportions_confint_diff is
    the group==1 subset — the verdicts named the wrong subgroup. Labels now
    refer to group=1.
    """
    boundaries = proportions_confint_diff(
        df[df[group] == 1]["No-show"], df[df[group] == 0]["No-show"], alpha=alpha
    )
    if boundaries[0] < 0 and boundaries[1] < 0:
        # Whole CI below zero: the group==1 subset no-shows less often.
        return f"CI is {boundaries}, proportion in the first ({group}=1) group tends to be less"
    elif boundaries[0] > 0 and boundaries[1] > 0:
        # Whole CI above zero: the group==1 subset no-shows more often.
        return (
            f"CI is {boundaries}, proportion in the first group ({group}=1) tends to be greater"
        )
    else:
        # CI straddles zero: no significant difference at this alpha.
        return f"CI is {boundaries}, can not make definitive decision"
from scipy.stats import bootstrap
The function below draws a bar plot for each categorical variable together with a table, in which the prior probability of show and no-show is calculated for each subcategory.
def group_draw(by):
    """Stacked bar plot plus table of counts and within-level shares.

    For column *by*, counts appointments in each (by, No-show) cell and adds
    "ShowProb", each cell's share within its *by* level, then returns a
    holoviews layout of the bar chart and a table.
    """
    group = (
        df.groupby([by, "No-show"])
        .count()[["PatientId"]]
        .rename(columns={"PatientId": "Count"})
    )
    # Keep the (by, No-show) MultiIndex so it can be restored after reset.
    ind = group.index
    group = group.reset_index()
    # Normalise counts to shares separately within the by==0 and by==1
    # slices, then stitch the two pieces back together row-aligned.
    # NOTE(review): assumes *by* is 0/1-coded; rows with other codes (e.g.
    # Handcap values 2-4) would get NaN here — confirm intended usage.
    group["ShowProb"] = pd.concat(
        [
            group[group[by] == 0].transform(lambda x: x / sum(x))["Count"],
            group[group[by] == 1].transform(lambda x: x / sum(x))["Count"],
        ]
    )
    group = group[["Count", "ShowProb"]].set_index(ind)
    return group.plot.bar(stacked=True, title=by) + group.hvplot.table(
        width=360
    )
Now let's have a look at the bar plots for each category, combined with tables where the probability of (no-)show is calculated.
# Per-category analysis: each trio builds the grouped viz and the CI verdict,
# then displays both (the notebook shows every bare expression). The quoted
# line after each trio is that cell's captured output, kept verbatim.
viz3 = group_draw("Scholarship")
text3 = calculate_ci("Scholarship", 0.05)
viz3; text3
'CI is (3.095, 4.771), proportion in the first group (Scholarship=0) tends to be greater'
viz4 = group_draw("Gender")
text4 = calculate_ci("Gender", 0.05)
viz4; text4
'CI is (-0.843, 0.147), can not make definitive decision'
viz5 = group_draw("Hypertension")
text5 = calculate_ci("Hypertension", 0.05)
viz5; text5
'CI is (-4.166, -3.029), proportion in the first (Hypertension=0) group tends to be less'
viz6 = group_draw("Diabetes")
text6 = calculate_ci("Diabetes", 0.05)
viz6; text6
'CI is (-3.236, -1.476), proportion in the first (Diabetes=0) group tends to be less'
viz7 = group_draw("Alcoholism")
text7 = calculate_ci("Alcoholism", 0.05)
viz7; text7
'CI is (-1.42, 1.335), can not make definitive decision'
viz8 = group_draw("SMS_received")
text8 = calculate_ci("SMS_received", 0.05)
viz8; text8
'CI is (10.34, 11.413), proportion in the first group (SMS_received=0) tends to be greater'
Now, let's calculate the probabilities of no-show for each age, neighbourhood, scheduled hour and awaiting time, and see whether there are any patterns.
# No-show rate per age: crosstab counts, then column 1 (no-show code) over
# the row total. NOTE(review): the name "prob_show" is misleading — per the
# encoding note above, column 1 is the *no-show* class; confirm.
probs_age = pd.crosstab(index=df["Age"], columns=df["No-show"])
probs_age["prob_show"] = probs_age.loc[:, 1] / (
probs_age.loc[:, 0] + probs_age.loc[:, 1]
)
viz9 = probs_age.hvplot.scatter(x="Age", y="prob_show")
viz9
# Same rate computed per neighbourhood code.
probs_place = pd.crosstab(index=df["Neighbourhood"], columns=df["No-show"])
probs_place["prob_show"] = probs_place.loc[:, 1] / (
probs_place.loc[:, 0] + probs_place.loc[:, 1]
)
viz10 = probs_place.hvplot.scatter(x="Neighbourhood", y="prob_show")
viz10
As we can see here, the age and neighbourhood probabilities are distributed almost uniformly (except for a single peculiar point on each graph).
# No-show rate by the hour of day the appointment was scheduled.
probs_hour = (
pd.crosstab(index=df["ScheduledDay"].dt.hour, columns=df["No-show"])
.reset_index()
.rename(columns={"ScheduledDay": "ScheduledHour"})
)
# Column 1 is the no-show code; divide by the row total.
probs_hour["prob_hour"] = probs_hour.loc[:, 1] / (
probs_hour.loc[:, 0] + probs_hour.loc[:, 1]
)
viz11 = probs_hour.hvplot.scatter(x="ScheduledHour", y="prob_hour")
viz11
# No-show rate by days waited between scheduling and appointment.
probs_await = pd.crosstab(index=df["AwaitingTime"], columns=df["No-show"])
probs_await["prob_await"] = probs_await.loc[:, 1] / (
probs_await.loc[:, 0] + probs_await.loc[:, 1]
)
viz12 = probs_await.hvplot.scatter(x="AwaitingTime", y="prob_await")
viz12
# import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the feature matrix, dropping date, ID and target columns.
# NOTE(review): X appears unused afterwards — the later train/test split is
# built from df's raw columns instead; verify whether this cell is dead code.
encoder = OneHotEncoder()
X, y = (
encoder.fit_transform(
df.drop(["ScheduledDay", "AppointmentDay", "PatientId", "No-show"], axis=1)
),
df["No-show"],
)
Now let's train and compare a pair of classifiers that will try to predict our target variable.
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation (no random_state set, so the split
# varies between runs).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["ScheduledDay", "AppointmentDay", "PatientId", "No-show"], axis=1),
    df["No-show"],
    test_size=0.2,
)
mclf, bclf = MultinomialNB(), BernoulliNB()
mclf.fit(X_train, y_train)
# BUG FIX: bclf was fitted on (X_test, y_test) — training on the evaluation
# set, which leaks test data and makes the two accuracies incomparable.
# Fit on the training partition, like the multinomial model.
bclf.fit(X_train, y_train)
# Report accuracies in percent; scale before rounding so the printout does
# not show float artefacts like "70.00000000000001 %".
print("Accuracy:", round(accuracy_score(y_test, mclf.predict(X_test)) * 100, 2), "%")
print("Accuracy:", round(accuracy_score(y_test, bclf.predict(X_test)) * 100, 2), "%")
MultinomialNB()
BernoulliNB()
Accuracy: 70.0 % Accuracy: 80.0 %
Such a result is expected: the Bernoulli classifier is well suited to situations like ours, with a variety of binary features.
# --- Assemble and launch the Panel dashboard ---
# NOTE(review): text1, code_ci and text03 are not defined anywhere in this
# file view — presumably created in other notebook cells; confirm before
# running the script top-to-bottom.
df_pane = pn.pane.DataFrame(df.head(), width=1400)
template = pn.template.FastListTemplate(
    theme=DefaultTheme,
    title="No Show Appointment Interactive Dashboard and Analysis",
    sidebar=[
        pn.pane.Markdown("# About the project"),
    ],
    main=[
        pn.Row(pn.Column(pn.pane.Markdown(text1), df_pane)),
        pn.Row(pn.Column(text01, viz1, viz2)),
        pn.Row(
            pn.Column(
                pn.pane.Markdown(code_ci),
                text03,
                viz3,
                text3,
                viz4,
                text4,
                viz5,
                text5,
                viz6,
                text6,
                viz7,
                text7,
                viz8,
                text8,
            )
        ),
    ],
)
# Register for `panel serve` (semicolon suppresses notebook echo) …
template.servable();
# … and also open a local Bokeh server window immediately.
template.show()
Launching server at http://localhost:60989
<bokeh.server.server.Server at 0x7fadbb7d1e10>
WARNING:tornado.access:404 GET /static/extensions/panel/bundled/fastbasetemplate/images/header-columns-over-bg.gif (::1) 0.76ms